MLG Filter Statistics

Functions

See the source for details

Directory for RAD seq data

RAD_data_dir <- "~/Documents/Grunwald/short-scripts/Genotype_error/"

Analysis of P. infestans

library('poppr')
## Loading required package: adegenet
## Loading required package: ade4
##    ==========================
##     adegenet 1.4-2 is loaded
##    ==========================
## 
##  - to start, type '?adegenet'
##  - to browse adegenet website, type 'adegenetWeb()'
##  - to post questions/comments: adegenet-forum@lists.r-forge.r-project.org
## 
## 
## This is poppr version 1.1.4.99.125. To get started, type package?poppr
library('ape')
library('phangorn')
library('animation')
infdat <- RCurl::getURL("https://raw.githubusercontent.com/grunwaldlab/phytophthora_id/master/shiny-server/www/genoid_infestans/reduced_database.txt.csv")
infdat <- read.table(text = infdat, head = TRUE)
pinf <- df2genind(infdat[-c(1,2)], sep = "/", ploidy = 3, ind.names = infdat[[1]], pop = infdat[[2]])
ssr <- c(3,3,2,3,3,2,2,3,3,3,3,3)
x <- as.genclone(pinf)

fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))

# # 
# tiff(filename = "images/pinf_cluster.tiff", width = 85, height = 68, units = "mm", res = 1200, pointsize = 6)
# fstats <- filter_stats(x, bruvo.dist, plot = TRUE, replen = ssr, loss = FALSE, nclone = 18)
# title(main = expression(paste(italic("P. infestans"), " reference isolates (12 SSR loci)")))
# dev.off()

# legend("topright", legend = c("Nearest Neighbor", "UPGMA", "Farthest Neighbor"), 
#        col = c("red", "black", "blue"), pch = 1, title = "Clustering Method")

The plot above shows how multilocus genotypes collapse under differing algorithms over genetic distance.

Below, we will collapse MLGs with a threshold of an average of 2 mutational steps over all loci and create contingency tables relating the clustered MLGs to the previously defined MLGs (eg. US-8).

z <- filter_stats(x, bruvo.dist, replen = ssr, loss = FALSE, threshold = 0.75/12, stats = "MLGS")
print(table(pop(x), z$farthest), zero.print = ".")
##        
##         1 3 5 8 9 10 12 13 15 17 19 20 22 23 24 25 27 28 29
##   B     . . . . .  .  .  .  1  .  .  .  .  .  .  .  .  .  .
##   C     . 1 . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   EU-13 . . . . 1  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   EU-4  . . . . .  1  .  .  .  .  .  .  .  .  .  .  .  .  .
##   EU-8  . . . . .  .  .  1  .  .  .  .  .  .  .  .  .  .  .
##   EU-5  . . . . .  .  2  .  .  .  .  .  .  .  .  .  .  .  .
##   US-8  . . . . .  .  .  .  .  .  .  1  .  .  2  1  .  .  .
##   D.1   1 . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   D.2   1 . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-11 . . . . .  .  .  .  .  2  .  .  .  .  .  .  .  .  .
##   US-12 . . 1 . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-14 . . . . .  .  .  .  .  .  .  .  1  .  .  .  .  .  .
##   US-17 . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  1  .
##   US-20 . . . . .  .  .  .  .  .  .  .  .  .  .  .  2  .  .
##   US-21 . . . . .  .  .  .  .  .  2  .  .  .  .  .  .  .  .
##   US-22 . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  2
##   US-23 . . . 3 .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-24 . . . . .  .  .  .  .  .  .  .  .  3  .  .  .  .  .
print(table(pop(x), z$nearest), zero.print = ".")
##        
##         1 3 5 6 9 10 12 13 15 17 19 20 23 24 25 27 28 29
##   B     . . . . .  .  .  .  1  .  .  .  .  .  .  .  .  .
##   C     . 1 . . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   EU-13 . . . . 1  .  .  .  .  .  .  .  .  .  .  .  .  .
##   EU-4  . . . . .  1  .  .  .  .  .  .  .  .  .  .  .  .
##   EU-8  . . . . .  .  .  1  .  .  .  .  .  .  .  .  .  .
##   EU-5  . . . . .  .  2  .  .  .  .  .  .  .  .  .  .  .
##   US-8  . . . . .  .  .  .  .  .  .  1  .  2  1  .  .  .
##   D.1   1 . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   D.2   1 . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-11 . . . . .  .  .  .  .  2  .  .  .  .  .  .  .  .
##   US-12 . . 1 . .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-14 . . . . .  .  .  .  .  .  .  .  .  1  .  .  .  .
##   US-17 . . . . .  .  .  .  .  .  .  .  .  .  .  .  1  .
##   US-20 . . . . .  .  .  .  .  .  .  .  .  .  .  2  .  .
##   US-21 . . . . .  .  .  .  .  .  2  .  .  .  .  .  .  .
##   US-22 . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  2
##   US-23 . . . 3 .  .  .  .  .  .  .  .  .  .  .  .  .  .
##   US-24 . . . . .  .  .  .  .  .  .  .  3  .  .  .  .  .
tab <- mlg.table(x, bar = FALSE)
colnames(tab) <- 1:ncol(tab)
print.table(tab, zero.print = ".") # contingency table for zero tolerance MLGs.
##       1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## B     . . . . . . . . .  .  .  .  .  .  1  .  .  .  .  .  .  .  .  .  .  .
## C     . . 1 . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## EU-13 . . . . . . . . 1  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## EU-4  . . . . . . . . .  1  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## EU-8  . . . . . . . . .  .  .  .  1  .  .  .  .  .  .  .  .  .  .  .  .  .
## EU-5  . . . . . . . . .  .  1  1  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-8  . . . . . . . . .  .  .  .  .  1  .  .  .  .  .  1  .  .  .  1  1  .
## D.1   1 . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## D.2   . 1 . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-11 . . . . . . . . .  .  .  .  .  .  .  1  1  .  .  .  .  .  .  .  .  .
## US-12 . . . . 1 . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-14 . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  1  .  .  .  .
## US-17 . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-20 . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  1
## US-21 . . . . . . . . .  .  .  .  .  .  .  .  .  1  1  .  .  .  .  .  .  .
## US-22 . . . 1 . . . . .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-23 . . . . . 1 1 1 .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .  .
## US-24 . . . . . . . . .  .  .  .  .  .  .  .  .  .  .  .  1  .  2  .  .  .
##       27 28 29
## B      .  .  .
## C      .  .  .
## EU-13  .  .  .
## EU-4   .  .  .
## EU-8   .  .  .
## EU-5   .  .  .
## US-8   .  .  .
## D.1    .  .  .
## D.2    .  .  .
## US-11  .  .  .
## US-12  .  .  .
## US-14  .  .  .
## US-17  .  1  .
## US-20  1  .  .
## US-21  .  .  .
## US-22  .  .  1
## US-23  .  .  .
## US-24  .  .  .

Most of the MLGs were able to be resolved. US-8 and D.1 and D.2 are not exactly resolved, but that is shown in the trees produced from the commented code below.

# Uncomment to regenerate plots
# 
# for (i in names(fstats)){
#   HTML_collapse(measure = i, x = x, treefun = "upgma", 
#                 distfun = "bruvo.dist", fstats = fstats, destdir = "images", 
#                 replen = ssr, loss = FALSE)  
#   HTML_collapse(measure = i, x = x, treefun = "nj", 
#                 distfun = "bruvo.dist", fstats = fstats, destdir = "images", 
#                 replen = ssr, loss = FALSE)  
# }

Simulated data

This will create 20 populations with 20 samples and 10k SNPs. Each population will have:

  • n = 20
  • 10,000 snps
  • an error rate of 0.1

In addition, half of these populations will have undergone one generation of clonal reproduction.

set.seed(20150412)
x <- lapply(1:10, getSims, n = 20, snps = 1e3, strucrat = 1, ploidy = 2, err = 0.05, na.perc = 0.21, clone = TRUE, n.cores = 4)
## Loading required package: parallel
y <- lapply(1:10, getSims, n = 20, snps = 1e3, strucrat = 1, ploidy = 2, err = 0.05, na.perc = 0.21, clone = FALSE, n.cores = 4)
# x <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = TRUE, n.cores = 4)
# y <- getSims(n = 200, snps = 1e4, strucrat = 1, ploidy = 2, err = 0.1, clone = FALSE, n.cores = 4)

For analysis, \(\frac{1}{5}\)th of the pooled samples will be kept.

x <- do.call("rbind", c(x, y))
x <- x[sample(nInd(x), nInd(x)/5)]
trueclones <- duplicated(substr(indNames(x), start = 1, stop = 10))
fstats <- filter_stats(x, bitwise.dist, plot = TRUE, nclone = sum(!trueclones))

# the_threshold <- fstats$average$thresholds[sum(trueclones)] + .Machine$double.eps^0.5
the_threshold <- threshold_predictor(fstats$average$thresholds)
thresh     <- duplicated(mlg.filter(x, distance = bitwise.dist, 
                                    threshold = the_threshold, 
                                    algo = "a"))
(threshtable <- table(thresh, trueclones))
##        trueclones
## thresh  FALSE TRUE
##   FALSE    75    0
##   TRUE      0    7

The tabulation is a power analysis of how many true and false positives there are when collapsing at the threshold that gives the same number of known clones/replicates.

Below is labelling a tree with known clones.

the_tree <- upgma(bitwise.dist(x))
clones <- substr(the_tree$tip.label[thresh], start = 1, stop = 10)
clones <- lapply(clones, grep, the_tree$tip.label)
edgelist <- length(which.edge(the_tree, the_tree$tip.label))
edgecols <- rep("black", edgelist)
for (i in clones){
  edgecols[which.edge(the_tree, the_tree$tip.label[i])] <- "red"
}
plot.phylo(the_tree, edge.color = edgecols, adj = 0, label.offset = 0.001)
axisPhylo(1)
title("Random sequences with 1000 SNPs and a 0.21 error rate")

nreps <- 100
resarray <- array(data = integer(nreps*4), dim = c(2, 2, nreps), 
                  dimnames = c(dimnames(threshtable), NULL))
neararray <- resarray
fararray  <- resarray
avarray   <- resarray
samplist  <- lapply(1:nreps, function(x) list(samp = NULL, tree = NULL, 
                                              mlgs = NULL))
Sys.time()
## [1] "2015-04-15 12:47:49 PDT"
for (i in 1:nreps){
  set.seed(i) # setting seed for accuracy.
  snps <- rpois(1, 1e3)
  samp1 <- lapply(1:10, getSims, n = 20, snps = snps, strucrat = 1, ploidy = 2, 
                  err = 0.05, na.perc = 0.21, clone = TRUE, n.cores = 4)
  samp2 <- lapply(1:10, getSims, n = 20, snps = snps, strucrat = 1, ploidy = 2, 
                  err = 0.05, na.perc = 0.21, clone = FALSE, n.cores = 4)
  samp <- do.call("rbind", c(samp1, samp2))
  samp@ploidy <- rep(2L, nInd(samp))
  samp <- samp[sample(nInd(samp), nInd(samp)/5)]
  trueclones <- duplicated(substr(indNames(samp), start = 1, stop = 10))
  fstats <- filter_stats(samp, bitwise.dist, plot = TRUE)
  # the_threshold <- fstats$average$thresholds[sum(trueclones)] + .Machine$double.eps^0.5
  title(paste("seed:", i, "n:", nInd(samp), "snps:", snps))
  the_threshold <- threshold_predictor(fstats$average$thresholds)
  the_distance  <- bitwise.dist(x)
  z <- filter_stats(x = samp, distance = bitwise.dist, 
                    threshold = the_threshold, stats = "MLGs")
  abline(v = the_threshold, lty = 2)
  text(the_threshold, 0, 
       labels = paste("Threshold:", signif(the_threshold, 3)), 
       adj = 0)
  samplist[[i]]$samp <- samp
  samplist[[i]]$tree <- upgma(the_distance)
  samplist[[i]]$mlgs <- z
  
  athresh <- duplicated(z$average)
  nthresh <- duplicated(z$nearest)
  fthresh <- duplicated(z$farthest)
  
  avarray[, , i] <- table(athresh, trueclones)
  avarray[, , i] <- sweep(avarray[, , i], 2, colSums(avarray[, , i]), "/")

  neararray[, , i] <- table(nthresh, trueclones)
  neararray[, , i] <- sweep(neararray[, , i], 2, colSums(neararray[, , i]), "/")

  fararray[, , i] <- table(fthresh, trueclones)
  fararray[, , i] <- sweep(fararray[, , i], 2, colSums(fararray[, , i]), "/")
}

Results

Sys.time()
## [1] "2015-04-15 12:52:57 PDT"
# color_mlg_tree(samp, upgma(bitwise.dist(samp)), z$average)
# axisPhylo(1)

Now we get to see how well we did.

(ares <- apply(avarray, 1:2, mean))
##        trueclones
## thresh       FALSE       TRUE
##   FALSE 0.97782633 0.02488684
##   TRUE  0.02217367 0.97511316
(nres <- apply(neararray, 1:2, mean))
##        trueclones
## thresh       FALSE       TRUE
##   FALSE 0.97757633 0.02463684
##   TRUE  0.02242367 0.97536316
(fres <- apply(fararray, 1:2, mean))
##        trueclones
## thresh       FALSE       TRUE
##   FALSE 0.97795133 0.02501184
##   TRUE  0.02204867 0.97498816
True Positive % False Positive %
farthest 97.5 2.22
average 97.5 2.24
nearest 97.5 2.20

RAD seq data

Note that this data has no reference and has a lot of error. There are 10 technical replicates. Each file represents a different parameter used for STACKS.

plinklist <- list(m3 = character(0), m4 = character(0), m10 = character(0), def = character(0))
plinklist[["m4"]]  <- "2R/PopSamples/data.out/PopSamples_m4/Popsouts_Rselec/out.replicates/plink.raw"
plinklist[["def"]] <- "2R/PopSamples/data.out/PopSamples_def/Popsouts_Rselec/out.replicates/plink.raw"
plinklist[["m3"]]  <- "2R/PopSamples/data.out/PopSamples_m3/Popsouts_Rselec/out.replicates/plink.raw"
plinklist[["m10"]] <- "2R/PopSamples/data.out/PopSamples_m10/Popsouts_Rselec/out.replicates/plink.raw"



contlist   <- plinklist # Contingency tables
threshlist <- plinklist # Threshold stats

Steps: 1. read in data 2. mlg.filter on all algorithms and plot the thresholds. 3. create the contingency table for each output (using UPGMA method).

Note for each plot regarding the MLG filter:

  • Red: Nearest Neighbor clustering
  • Blue: Farthest Neighbor clustering
  • Black: UPGMA clustering (average neighbor)

The dotted lines represent the threshold at which the algorithms each creates 10 clusters.

for (i in names(plinklist)){
  barb <- read.PLINK(paste(RAD_data_dir, plinklist[[i]], sep = "/"))
  show(barb)
  fstats <- filter_stats(barb, bitwise.dist, plot = TRUE, nclone = nInd(barb) - 10)
  title(paste(i, "SNPS:", nLoc(barb)))
  minthresh  <- fstats$average$thresholds[10] + .Machine$double.eps^0.5
#   nearthresh  <- fstats$nearest$thresholds[10] + .Machine$double.eps^0.5
#   farthresh  <- fstats$farthest$thresholds[10] + .Machine$double.eps^0.5
#   abline(v = minthresh, lty = 2)
#   abline(v = nearthresh, lty = 2, col = "red")
#   abline(v = farthresh, lty = 2, col = "blue")
#   legend("topright", legend = c("Nearest Neighbor", "UPGMA", "Farthest Neighbor"), 
#          col = c("red", "black", "blue"), pch = 1, title = "Clustering Method")
  thresh     <- mlg.filter(barb, distance = bitwise.dist, algorithm = "a", 
                           threshold = minthresh)
  trueclones <- vapply(strsplit(indNames(barb), "_"), "[[", character(1), 1)
  trueclones <- duplicated(trueclones)
  thresh     <- duplicated(thresh)
  contlist[[i]] <- table(thresh, trueclones)
  threshlist[[i]] <- fstats$average$thresholds
}
## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
## 
##  === S4 class genlight ===
##  77 genotypes,  11057 binary SNPs
##  Ploidy: 2
##  170008 (0.2 %) missing data
##  @pop: individual membership for 9 populations
##  @loc.names: labels of the SNPs
##  @other: a list containing: sex  phenotype  pat  mat

## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
## 
##  === S4 class genlight ===
##  77 genotypes,  4353 binary SNPs
##  Ploidy: 2
##  69547 (0.21 %) missing data
##  @pop: individual membership for 9 populations
##  @loc.names: labels of the SNPs
##  @other: a list containing: sex  phenotype  pat  mat

## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
## 
##  === S4 class genlight ===
##  78 genotypes,  502 binary SNPs
##  Ploidy: 2
##  6359 (0.16 %) missing data
##  @pop: individual membership for 9 populations
##  @loc.names: labels of the SNPs
##  @other: a list containing: sex  phenotype  pat  mat

## 
##  Reading PLINK raw format into a genlight object... 
## 
## 
##  Reading loci information... 
## 
##  Reading and converting genotypes... 
## .
##  Building final object... 
## 
## ...done.
## 
##  === S4 class genlight ===
##  77 genotypes,  7736 binary SNPs
##  Ploidy: 2
##  126467 (0.21 %) missing data
##  @pop: individual membership for 9 populations
##  @loc.names: labels of the SNPs
##  @other: a list containing: sex  phenotype  pat  mat

Print the contingency tables and differences between threshold values to see if there is a large jump indicating a separation between replicates and independent samples.

print(contlist)
## $m3
##        trueclones
## thresh  FALSE TRUE
##   FALSE    60    7
##   TRUE      7    3
## 
## $m4
##        trueclones
## thresh  FALSE TRUE
##   FALSE    61    6
##   TRUE      6    4
## 
## $m10
##        trueclones
## thresh  FALSE TRUE
##   FALSE    57    8
##   TRUE     11    2
## 
## $def
##        trueclones
## thresh  FALSE TRUE
##   FALSE    61    6
##   TRUE      6    4
for (i in threshlist){
  plot(diff(i), log = "y")
}
## Warning in xy.coords(x, y, xlabel, ylabel, log): 2 y values <= 0 omitted
## from logarithmic plot

## Warning in xy.coords(x, y, xlabel, ylabel, log): 14 y values <= 0 omitted
## from logarithmic plot

## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot

No such luck.

Now, we are plotting the tree where 10 samples are collapsed via average neighbor (UPGMA) and color the tips with the true duplicates.

Note about the figure: Tips are colored blue. Internal branches are colored black. If the algorithm found samples with a distance below the threshold, their connecting branches are colored red. The duplicated samples have red labels

defupgma <- phangorn::upgma(bitwise.dist(barb))

z <- filter_stats(barb, bitwise.dist, threshold = minthresh, stats = "MLGS")
barbnames <- vapply(strsplit(indNames(barb), "_"), "[[", character(1), 1)
dupes <- barbnames[duplicated(barbnames)]
thecols <- ifelse(barbnames %in% dupes, "red", "black")
color_mlg_tree(barb, defupgma, z$average, tip.color = thecols)
axisPhylo(1)